/*************************************************************************/
/*								 	 */
/*    Central tree-forming algorithm incorporating all criteria  	 */
/*    ---------------------------------------------------------	 	 */
/*								 	 */
/*************************************************************************/
#ifndef BUU
#define BUU

#include "defns.i"
#include "types.i"
#include "extern.i"
#include "subset.h"
#include "info.h"
#include "contin.h"

ItemCount
	*Weight,	/* Weight[i]  = current fraction of item i */
	**Freq,		/* Freq[x][c] = no. items of class c with outcome x */
	*ValFreq,	/* ValFreq[x]   = no. items with outcome x */
	*ClassFreq;	/* ClassFreq[c] = no. items of class c */

float
	*Gain,		/* Gain[a] = info gain by split on att a */
	*Info,		/* Info[a] = potential info of split on att a */
	*Bar,		/* Bar[a]  = best threshold for contin att a */
	*UnknownRate;	/* UnknownRate[a] = current unknown rate for att a */

Boolean
	*Tested,	/* Tested[a] set if att a has already been tested */
	MultiVal;	/* true when all atts have many values */


/*  External variables initialised here  */

extern float
	*SplitGain,	/* SplitGain[i] = gain with att value of item i as threshold */
	*SplitInfo;	/* SplitInfo[i] = potential info ditto */

extern ItemCount
	*Slice1,	/* Slice1[c]    = saved values of Freq[x][c] in subset.c */
	*Slice2;	/* Slice2[c]    = saved values of Freq[y][c] */

extern Set
	**Subset;	/* Subset[a][s] = subset s for att a */

extern short
	*Subsets;	/* Subsets[a] = no. subsets for att a */

void InitialiseWeights();
void InitialiseTreeData();
Tree FormTree(ItemNo Fp, ItemNo Lp);
ItemNo Group(DiscrValue V, ItemNo Fp, ItemNo Lp, Tree TestNode);
ItemCount CountItems(ItemNo Fp, ItemNo Lp);
void Swap(ItemNo a, ItemNo b);


/*************************************************************************/
/*								 	 */
/*		Allocate space for tree tables			 	 */
/*								 	 */
/*************************************************************************/

void InitialiseTreeData()
{ 
	DiscrValue v;
	Attribute a;

	Tested	= (char *) calloc(MaxAtt+1, sizeof(char));

	Gain	= (float *) calloc(MaxAtt+1, sizeof(float));
	Info	= (float *) calloc(MaxAtt+1, sizeof(float));
	Bar		= (float *) calloc(MaxAtt+1, sizeof(float));

	Subset = (Set **) calloc(MaxAtt+1, sizeof(Set *));
	ForEach(a, 0, MaxAtt)
	{
		if ( MaxAttVal[a] )
		{
			Subset[a]  = (Set *) calloc(MaxDiscrVal+1, sizeof(Set));
			ForEach(v, 0, MaxAttVal[a])
			{
				Subset[a][v] = (Set) malloc((MaxAttVal[a]>>3) + 1);
			}
		}
	}
	Subsets = (short *) calloc(MaxAtt+1, sizeof(short));

	SplitGain = (float *) calloc(MaxItem+1, sizeof(float));
	SplitInfo = (float *) calloc(MaxItem+1, sizeof(float));

	Weight = (ItemCount *) calloc(MaxItem+1, sizeof(ItemCount));

	Freq  = (ItemCount **) calloc(MaxDiscrVal+1, sizeof(ItemCount *));
	ForEach(v, 0, MaxDiscrVal)
	{
		Freq[v]  = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));
	}

	ValFreq = (ItemCount *) calloc(MaxDiscrVal+1, sizeof(ItemCount));
	ClassFreq = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));

	Slice1 = (ItemCount *) calloc(MaxClass+2, sizeof(ItemCount));
	Slice2 = (ItemCount *) calloc(MaxClass+2, sizeof(ItemCount));

	UnknownRate = (float *) calloc(MaxAtt+1, sizeof(float));

	/*  Check whether all attributes have many discrete values  */

	MultiVal = true;
	if ( ! SUBSET )
	{
		for ( a = 0 ; MultiVal && a <= MaxAtt ; a++ )
		{
			if ( SpecialStatus[a] != IGNORE )
			{
				MultiVal = MaxAttVal[a] >= 0.3 * (MaxItem + 1);
			}
		}
	}
}



/*************************************************************************/
/*								 	 */
/*		Initialise the weight of each item		 	 */
/*								 	 */
/*************************************************************************/


void InitialiseWeights()
{
	ItemNo i;

	ForEach(i, 0, MaxItem)
	{
		Weight[i] = 1.0;
	}
}



/*************************************************************************/
/*								 	 */
/*  Build a decision tree for the cases Fp through Lp:		 	 */
/*								 	 */
/*  - if all cases are of the same class, the tree is a leaf and so	 */
/*      the leaf is returned labelled with this class		 	 */
/*								 	 */
/*  - for each attribute, calculate the potential information provided 	 */
/*	by a test on the attribute (based on the probabilities of each	 */
/*	case having a particular value for the attribute), and the gain	 */
/*	in information that would result from a test on the attribute	 */
/*	(based on the probabilities of each case with a particular	 */
/*	value for the attribute being of a particular class)		 */
/*								 	 */
/*  - on the basis of these figures, and depending on the current	 */
/*	selection criterion, find the best attribute to branch on. 	 */
/*	Note:  this version will not allow a split on an attribute	 */
/*	unless two or more subsets have at least MINOBJS items. 	 */
/*								 	 */
/*  - try branching and test whether better than forming a leaf	 	 */
/*								 	 */
/*************************************************************************/



Tree FormTree(ItemNo Fp, ItemNo Lp)
{ 
// 	ItemNo i, Kp, Ep, Group();
	ItemNo i, Kp, Ep;
	ItemCount Cases, NoBestClass, KnownCases/*, CountItems()*/;
	float Factor, BestVal, Val, AvGain=0/*, Worth()*/;
	Attribute Att, BestAtt, Possible=0;
	ClassNo c, BestClass;
	Tree Node/*, Leaf()*/;
	DiscrValue v;
	Boolean PrevAllKnown;

	Cases = CountItems(Fp, Lp);

	/*  Generate the class frequency distribution  */

	ForEach(c, 0, MaxClass)
	{
		ClassFreq[c] = 0;
	}
	ForEach(i, Fp, Lp)
	{ 
		ClassFreq[ Class(Item[i]) ] += Weight[i];
	} 

	/*  Find the most frequent class  */

	BestClass = 0;
	ForEach(c, 0, MaxClass)
	{
		if ( ClassFreq[c] > ClassFreq[BestClass] )
		{
			BestClass = c;
		}
	}
	NoBestClass = ClassFreq[BestClass];

	Node = Leaf(ClassFreq, BestClass, Cases, Cases - NoBestClass);

	/*  If all cases are of the same class or there are not enough
	cases to divide, the tree is a leaf  */

	if ( NoBestClass == Cases  || Cases < 2 * MINOBJS )
	{ 
		return Node;
	} 

	Verbosity(1)
	{
		strTemp0.Format(_T("\n%d items, total weight %.1f\n"), Lp - Fp + 1, Cases);
		vecstrOutput.push_back(strTemp0);
	}
	/*  For each available attribute, find the information and gain  */

	ForEach(Att, 0, MaxAtt) 
	{ 
		Gain[Att] = -Epsilon;

		if ( SpecialStatus[Att] == IGNORE ) continue;

		if ( MaxAttVal[Att] )
		{
			/*  discrete valued attribute  */

			if ( SUBSET && MaxAttVal[Att] > 2 )
			{
				EvalSubset(Att, Fp, Lp, Cases);
			}
			else
				if ( ! Tested[Att] )
				{
					EvalDiscreteAtt(Att, Fp, Lp, Cases);
				}
		}
		else
		{ 
			/*  continuous attribute  */

			EvalContinuousAtt(Att, Fp, Lp);
		} 

		/*  Update average gain, excluding attributes with very many values  */

		if ( Gain[Att] > -Epsilon &&
			( MultiVal || MaxAttVal[Att] < 0.3 * (MaxItem + 1) ) )
		{
			Possible++;
			AvGain += Gain[Att];
		}
	} 

	/*  Find the best attribute according to the given criterion  */

	BestVal = -Epsilon;
	BestAtt = None;
	AvGain  = ( Possible ? AvGain / Possible : 1E6 );

	Verbosity(2)
	{
		if ( AvGain < 1E6 ) 
		{
			strTemp0.Format(_T("\taverage gain %.3f\n"), AvGain);
			vecstrOutput.push_back(strTemp0);
		}
	}

	ForEach(Att, 0, MaxAtt) 
	{ 
		if ( Gain[Att] > -Epsilon )
		{ 
			Val = Worth(Info[Att], Gain[Att], AvGain);
			if ( Val > BestVal ) 
			{ 
				BestAtt  = Att; 
				BestVal = Val;
			} 
		} 
	} 

	/*  Decide whether to branch or not  */ 

	if ( BestAtt != None )
	{ 
		Verbosity(1)
		{
			strTemp0.Format(_T("\tbest attribute %s"), AttName[BestAtt]);
			vecstrOutput.push_back(strTemp0);
			if ( ! MaxAttVal[BestAtt] )
			{
				strTemp0.Format(_T(" cut %.3f"), Bar[BestAtt]);
				vecstrOutput.push_back(strTemp0);
			}
			strTemp0.Format(_T(" inf %.3f gain %.3f val %.3f\n"),
				Info[BestAtt], Gain[BestAtt], BestVal);
			vecstrOutput.push_back(strTemp0);
		}	

		/*  Build a node of the selected test  */

		if ( MaxAttVal[BestAtt] )
		{
			/*  Discrete valued attribute  */

			if ( SUBSET && MaxAttVal[BestAtt] > 2 )
			{
				SubsetTest(Node, BestAtt);
			}
			else
			{
				DiscreteTest(Node, BestAtt);
			}
		}
		else
		{ 
			/*  Continuous attribute  */

			ContinTest(Node, BestAtt);
		} 

		/*  Remove unknown attribute values  */

		PrevAllKnown = AllKnown;

		Kp = Group(0, Fp, Lp, Node) + 1;
		if ( Kp != Fp ) AllKnown = false;
		KnownCases = Cases - CountItems(Fp, Kp-1);
		UnknownRate[BestAtt] = (Cases - KnownCases) / (Cases + 0.001);

		Verbosity(1)
		{
			if ( UnknownRate[BestAtt] > 0 )
			{
				strTemp0.Format(_T("\tunknown rate for %s = %.3f\n"),
					AttName[BestAtt], UnknownRate[BestAtt]);
				vecstrOutput.push_back(strTemp0);
			}
		}

		/*  Recursive divide and conquer  */

		++Tested[BestAtt];

		Ep = Kp - 1;
		Node->Errors = 0;

		ForEach(v, 1, Node->Forks)
		{
			Ep = Group(v, Kp, Lp, Node);

			if ( Kp <= Ep )
			{
				Factor = CountItems(Kp, Ep) / KnownCases;

				ForEach(i, Fp, Kp-1)
				{
					Weight[i] *= Factor;
				}

				Node->Branch[v] = FormTree(Fp, Ep);
// 				Node->Branch[v] = Leaf(Node->ClassDist, BestClass,CountItems(Fp,Lp),Node->Errors);
				
				Node->Errors += Node->Branch[v]->Errors;

				Group(0, Fp, Ep, Node);
				ForEach(i, Fp, Kp-1)
				{
					Weight[i] /= Factor;
				}
			}
			else
			{
				Node->Branch[v] = Leaf(Node->ClassDist, BestClass, 0.0, 0.0);
			}
		}

		--Tested[BestAtt];
		AllKnown = PrevAllKnown;

		/*  See whether we would have been no worse off with a leaf  */

		if ( Node->Errors >= Cases - NoBestClass - Epsilon )
		{ 
			Verbosity(1)
			{
				strTemp0.Format(_T("Collapse tree for %d items to leaf %s\n"),
				Lp - Fp + 1, ClassName[BestClass]);
				vecstrOutput.push_back(strTemp0);
			}

			Node->NodeType = 0;
		} 
	}
	else
	{ 
		Verbosity(1)
		{
			strTemp0.Format(_T("\tno sensible splits  %.1f/%.1f\n"),
			Cases, Cases - NoBestClass);
			vecstrOutput.push_back(strTemp0);
		}
	} 

	return Node; 
} 



/*************************************************************************/
/*								 	 */
/*  Group together the items corresponding to branch V of a test 	 */
/*  and return the index of the last such			 	 */
/*								 	 */
/*  Note: if V equals zero, group the unknown values		 	 */
/*								 	 */
/*************************************************************************/

ItemNo Group(DiscrValue V, ItemNo Fp, ItemNo Lp, Tree TestNode)
{
	ItemNo i;
	Attribute Att;
	float Thresh;
	Set SS;
	//void Swap();

	Att = TestNode->Tested;

	if ( V )
	{
		/*  Group items on the value of attribute Att, and depending
		on the type of branch  */

		switch ( TestNode->NodeType )
		{
		case BrDiscr:

			ForEach(i, Fp, Lp)
			{
				if ( DVal(Item[i], Att) == V ) Swap(Fp++, i);
			}
			break;

		case ThreshContin:

			Thresh = TestNode->Cut;
			ForEach(i, Fp, Lp)
			{
				if ( (CVal(Item[i], Att) <= Thresh) == (V == 1)) Swap(Fp++, i);
			}
			break;

		case BrSubset:

			SS = TestNode->Subset[V];
			ForEach(i, Fp, Lp)
			{
				if ( In(DVal(Item[i], Att), SS) ) Swap(Fp++, i);
			}
			break;
		}
	}
	else
	{
		/*  Group together unknown values  */

		switch ( TestNode->NodeType )
		{
		case BrDiscr:
		case BrSubset:

			ForEach(i, Fp, Lp)
			{
				if ( !DVal(Item[i], Att) ) Swap(Fp++, i);
			}
			break;

		case ThreshContin:

			ForEach(i, Fp, Lp)
			{
				if ( CVal(Item[i], Att) == Unknown ) Swap(Fp++, i);
			}
			break;
		}
	}

	return Fp - 1;
}



/*************************************************************************/
/*								 	 */
/*	Return the total weight of items from Fp to Lp		 	 */
/*								 	 */
/*************************************************************************/


ItemCount CountItems(ItemNo Fp, ItemNo Lp)
{
	register ItemCount Sum=0.0, *Wt, *LWt;

	if ( AllKnown ) return Lp - Fp + 1;
	LWt = Weight + Lp;
	for ( Wt = Weight + Fp; Wt <= LWt ; )
	{
		Sum += *Wt++;
	}

	return Sum;
// 	else
// 	{
// 		for ( Wt = Weight + Fp, LWt = Weight + Lp ; Wt <= LWt ; )
// 		{
// 			Sum += *Wt++;
// 		}
// 
// 		return Sum;
// 	}

}



/*************************************************************************/
/*                                                               	 */
/*		Exchange items at a and b			 	 */
/*									 */
/*************************************************************************/


void Swap(ItemNo a, ItemNo b)
{
	register Description Hold;
	register ItemCount HoldW;

	Hold = Item[a];
	Item[a] = Item[b];
	Item[b] = Hold;

	HoldW = Weight[a];
	Weight[a] = Weight[b];
	Weight[b] = HoldW;
}

#endif